%run init_notebook.py
from src.utils import *
from src.processing import *
from matplotlib.pyplot import cm
from src.nlp_utils import run_parallel, evalute_optimal_smoothing
from src.utils import save_fig
from src.pymc_modelling import get_samp
# Load the deterministic and the LDA cross-validated topic data sets.
tdet = load_pd_df("det_topics.feather")
tlda = load_pd_df("lda_cv_topics2.feather").reset_index()
# for each topic 25 Bayesian spline regressions are fit
# source code can be found under src.nlp_utils -> runs in parallel
res = run_parallel(tlda.drop(['id'], axis=1), id_col='date')
# FIX: the next line was raw cell output pasted into the source
# ("This process ran 793.6" -- presumably wall time in seconds);
# kept as a comment so the file parses.
# This process ran 793.6
# Merge the per-worker results into four topic-keyed lookup tables:
# WAIC comparisons, best knot counts, fitted traces, and grouped data.
dict_compare_az = {}
dict_best_nknot = {}
dict_compare_traces = {}
dict_data_grouped = {}
for comp_az, best_k, comp_trace, data_g in res:
    dict_compare_az.update(comp_az)
    dict_best_nknot.update(best_k)
    dict_compare_traces.update(comp_trace)
    dict_data_grouped.update(data_g)
# For each topic the "optimal" model is calculated as the WAIC-weight
# aggregation of the individual models' posterior-mean predictions.
dict_trace_optimal_agg = {}
# all topics share the same date index -- take it from any grouped frame
shared_index = list(dict_data_grouped.values())[0].index
for key in list(dict_compare_traces.keys()):
    traces = dict_compare_traces[key]
    # number of observations predicted by the splines (last axis of 'mu')
    n_obs = next(iter(traces.values())).posterior['mu'].values.shape[-1]
    posterior_means = {}
    for k, tr in traces.items():
        # 'mu' is the mean predicted by the model; average the posterior
        # over 2 chains x 100 sampled draws
        draws = tr.posterior['mu'].values[:, get_samp(1_000), :]
        posterior_means[k] = draws.reshape(2 * 100, n_obs).T.mean(axis=1)
    model_df = pd.DataFrame(posterior_means, index=shared_index)
    dict_trace_optimal_agg[key] = {
        'df': model_df,
        # aggregate the 25 candidate models according to their WAIC weights
        'opt_agg': model_df @ dict_compare_az[key].weight,
    }
# DataFrame of the per-topic optimal (WAIC-weighted) aggregations.
tlda_ind = pd.DataFrame({k: v['opt_agg'] for k, v in dict_trace_optimal_agg.items()})
tlda_ind['T_sum_ind_op_all'] = tlda_ind.sum(axis=1)
# NOTE(review): 'T_sum_ind_op_all' is already a column here, so the row sum
# below includes the grand-total column as well -- confirm this is intended.
tlda_ind['T_sum_ind_op'] = tlda_ind.drop(['D_inflation', 'M_policy_counter', 'rest'], axis=1).sum(axis=1)
tlda = tlda.drop(['id'], axis=1)
# cast every topic column to float and 'date' to datetime
dtype_map = {col: float for col in tlda.columns}
dtype_map['date'] = "datetime64[ns]"
tlda = pd_df_astype(tlda, dict_dtypes=dtype_map)
tlda = tlda.set_index('date')
# weekly bucket label (timestamp of the week period start)
tlda['W'] = tlda.index.to_period('W').to_timestamp()
# weekly last-observation total over all topics except the excluded ones
ser = tlda.drop(['M_policy', 'D_inflation', 'rest'], axis=1).groupby('W').last().sum(axis=1)
# ser2 = tlda.groupby('W').mean().sum(axis=1)
%%capture
# Fit spline models with knot counts 5..195 (step 10) to the weekly topic
# sum; `df` holds the WAIC comparison (sorted best-first -- presumably, as
# .iloc[:k] is used to take the top models below), `trace` the fitted traces.
df, mods, trace = evalute_optimal_smoothing(ser, range(5, 200, 10))
trace_data = {
# posterior mean of 'mu' over 2 chains x 100 sampled draws per model
k: trace[k].posterior['mu'].values[:, get_samp(1_000), :].reshape(2*100, ser.shape[0]).T.mean(axis=1)
for k in trace.keys()
}
tlda_sum = pd.DataFrame(ser)
tlda_sum.columns = ['T_sum_raw']
# WAIC-weighted model averages over all models, the top 3, and the top 10.
# NOTE(review): 'T_sum_glob_op5' actually uses iloc[:10] (top 10), not top 5
# as the column name suggests -- confirm which was intended.
tlda_sum['T_sum_glob'] = (pd.DataFrame(trace_data) @ df.weight).values
tlda_sum['T_sum_glob_op3'] = (pd.DataFrame(trace_data)[df.iloc[:3].index] @ df.iloc[:3].weight).values
tlda_sum['T_sum_glob_op5'] = (pd.DataFrame(trace_data)[df.iloc[:10].index] @ df.iloc[:10].weight).values
# Load the smoothed topic file and split it into raw vs weekly-smoothed views.
topics = load_pd_df('lda_topics_smooth2.feather').reset_index()
topics.set_index('date', inplace=True)
# raw (unsmoothed) topic shares: columns without the weekly/_diff suffixes
raw_cols = [c for c in topics.columns if "_W" not in c and "_diff" not in c]
topics_raw = topics[raw_cols].copy().drop(['M_policy', 'id'], axis=1)
# weekly-smoothed topic shares, renamed back to the bare topic names
week_cols = [c for c in topics.columns if "_W" in c and "_diff" not in c]
topics = topics[week_cols]
topics.columns = [c.split("_W")[0] for c in topics.columns]
# total topic intensity excluding the inflation and monetary-policy topics
topic_total = topics.drop(['D_inflation', 'M_policy'], axis=1).sum(axis=1).rename('T_sum')
topics = topics.join(topic_total)
# lagged first difference of the de-duplicated total
topics['T_sum_diff_lag'] = topics.T_sum.drop_duplicates().diff().shift(1)
topics = topics.drop_duplicates()
# monthly lagged difference of the total, joined back at monthly frequency
monthly_diff = pd_groupby(topics, ['T_sum'], 'M', 'last').diff().iloc[:, 0].rename('T_sum_diff_lag_M')
topics = pd_join_freq(topics, pd.DataFrame(monthly_diff), 'M')
# Align the global smoothed series to daily frequency, merge in the
# per-topic aggregates, then collapse everything to weekly observations.
tlda_sum = (
    tlda_sum.asfreq('D')
    .rename_axis('date')
    .join(topics.T_sum.rename('T_sum_ind'), how='left')
    .join(tlda_ind)
)
tlda_sum['W'] = tlda_sum.index.to_period('W').to_timestamp()
tlda_sum = tlda_sum.groupby('W').last()
# min-max scale every column to [0, 1] before persisting
tlda_sum = tlda_sum.apply(min_max_scale)
save_pd_df(tlda_sum, 'tlda_sum.feather')
# Compare deterministic topics against LDA CV topics: daily sums, smoothed
# with a 15-day rolling mean and min-max scaled per column.
tdet_g = tdet.drop('id', axis=1).groupby('date').sum() #/ tdet.drop('id', axis=1).groupby('date').count().iloc[:,0].values[:,None]
tdet_g = tdet_g.rolling(15).mean().apply(lambda x: (x - x.min()) / (x.max() - x.min()))
# FIX: by this point 'id' has already been dropped from tlda and 'date' is
# its index, so an unconditional drop(['id']) raises KeyError. Use
# errors='ignore' and also exclude the datetime helper column 'W' so the
# daily sum stays numeric.
tlda_g = tlda.drop(columns=['id', 'W'], errors='ignore').groupby('date').sum() #/ tlda.groupby('date').count().iloc[:,0].values[:,None]
tlda_g = tlda_g.rolling(15).mean().apply(lambda x: (x - x.min()) / (x.max() - x.min()))
# One panel per LDA topic, overlaying the deterministic series (colored)
# with the LDA series (black).
nrows = tlda_g.shape[1]
fig, axes = plt.subplots(nrows, 1, figsize=(12, 2 * nrows))
color = cm.rainbow(np.linspace(0, 1, nrows))
for i, ax in enumerate(axes.ravel()):
    col = tlda_g.columns[i]
    try:
        ax.plot(tdet_g.loc[:, col], c=color[i], marker='.')
        ax.plot(tlda_g.loc[:, col], c='black', marker='.')
    # FIX: was a broad `except Exception as e` with an unused `e`, silently
    # swallowing every error. Only a missing column in tdet_g is expected.
    except KeyError:
        continue
    ax.set_title(label=col)
    ax.legend(['det', 'lda'])
plt.tight_layout()
save_fig(fig, "02_topics_raw_det_lda.png")
# Topic aggregation illustrated graphically:
#  - red: all individual models; their weighted mean is the final model
#  - grey: posterior draws (confidence band) around the best single model
#  - blue / green: short and long moving averages of the raw data
dict_smoothed = {}
dict_plot = {col: [dict_data_grouped[col], dict_compare_traces[col][v]] for col, v in dict_best_nknot.items()}
fig, axes = plt.subplots(len(dict_plot), 1, figsize=(14, 3 * len(dict_plot)), dpi=200)
for i, (col, (data, trace)) in enumerate(dict_plot.items()):
    ax = axes.ravel()[i]
    ax.plot(dict_trace_optimal_agg[col]['df'].values, color='red', alpha=.1)
    ax.plot(dict_trace_optimal_agg[col]['opt_agg'].values, color='red', label='waic model average')
    ax.set_title(f"{col}: WAIC model average {col}")
    # 2 chains x 200 sampled posterior draws of the best model's 'mu'
    ax.plot(
        trace.posterior['mu'][:, get_samp(1_000, 200)].values.reshape(2*200, data.shape[0]).T,
        lw=.1,
        alpha=.3,
        color='grey',
    )
    ax.plot(data.rolling(30).mean().values, lw=0, marker='.', color='green', alpha=.4, label='mov avg 30 days')
    # FIX: label said "mov avg 10 days" but the rolling window is 5
    ax.plot(data.rolling(5).mean().values, lw=.5, marker='.', color='blue', alpha=.4, label='mov avg 5 days')
    ax.legend()
    # posterior mean of the best model (2 chains x 100 draws), kept for later
    dict_smoothed[col] = trace.posterior['mu'][:, get_samp(1_000)].values.reshape(2*100, data.shape[0]).T.mean(axis=1)
plt.tight_layout();
save_fig(fig, "02_topics_smooth_det_lda.png")